# Description ------------------------------------------------------------------

### This cleans the baseline tax collector survey variables needed for analysis 
### for the TAD Impact Evaluation Project, merges in treatment status, then 
### saves cleaned version as a .Rdata file

# Settings ---------------------------------------------------------------------
data_path <- 'data/1_raw/'
clean_path <- 'data/2_clean/'

# Packages ---------------------------------------------------------------------
#if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(readr)
library(stringr)
library(haven)
library(labelled)
library(hms)

# Convenience Functions --------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")

# Cleaning ---------------------------------------------------------------------

# Import uncleaned tax collector data
#tax collector baseline survey data
tax_collector_base <- read_dta(paste0(data_path, 
                                      'taxcollector_BASELINEFINAL.dta'))
#block id
block_ids <- read_dta(paste0(data_path, "block_ids.dta")) %>% 
  mutate_if(is.labelled, to_factor)

#creating hrs_in_mkt variable for baseline
tax_collector_base <- tax_collector_base %>% 
  mutate(hrs_in_mkt = e11 - e10)

#other variables to faciliate baseline and endline merge
tax_collector_base <- tax_collector_base %>% 
  mutate(vendors_visited = e12,
         vendors_visited_trim_99 = 
           ifelse(vendors_visited > quantile(vendors_visited, probs = .99),
                  NA, vendors_visited),
         vendors_visited_top_99 = ifelse(vendors_visited > 
                                           quantile(vendors_visited,
                                                    probs = .99,
                                                    na.rm = T),
                                         quantile(vendors_visited,
                                                  probs = .99,
                                                  na.rm = T),
                                         vendors_visited))


#demographic variables
tax_collector_base <- tax_collector_base %>% 
  mutate(gender = neg_to_na(r1),
         female = ifelse(gender == 1, 1, 0),
         age = neg_to_na(d8, outlier_top = 200),
         literacy = rev_values(neg_to_na(d14)),
         literacy_high = ifelse(literacy == 4, 1, 0),
         literacy_any = ifelse(literacy == 3 | literacy == 4, 1, 0),
         education = ifelse(d12 == 888, 0, neg_to_na(d12) + 1),
         marital_status = neg_to_na(d6),
         married = ifelse(marital_status == 2 | 
                            marital_status == 3, 1, 0),
         spk_english = rev_values(neg_to_na(d16)),
         no_english = ifelse(spk_english == 1, 1, 0),
         days_wrk_mkt = neg_to_na(e9),
         mnthly_earn = neg_to_na(e20),
         hh_income = neg_to_na(e25))

education_lab <- c(None = 0, "Nursery School" = 1,
                   "Standard 1" = 2, "Standard 2" = 3,
                   "Standard 3" = 4, "Standard 4" = 5,
                   "Standard 5" = 6, "Standard 6" = 7,
                   "Standard 7" = 8, "Standard 8" = 9,
                   "Form 1" = 10, "JCE/Form 2" = 11,
                   "Form 3" = 12, "MSCE/Form 4" = 13,
                   "Technical/Private College (non-Degree)" = 16,
                   Degree = 17, Masters = 18, PhD = 19)
literacy_lab <- c("Could not read" = 1, 
                  "Could read some of the card" = 2,
                  "Could read the whole card with difficulty" = 3,
                  "Could read the whole card with ease" = 4)

#NA values labels that have to be removed for factor purposes
na_vec <- c(-66, -77, -88, -99)

#removing NA values for factor purposes
tax_collector_base <- tax_collector_base %>% remove_NA_labels(na_vec, start = 348,
                                                            end = 361)

val_labels(tax_collector_base) <- list(education = education_lab,
                                      literacy = literacy_lab)

# spillover questions
tax_collector_base <- tax_collector_base %>% 
  mutate(collect_othr_mkts = neg_to_na(e18),
         jobs_othr_mkts = neg_to_na(e6),
         othrs_ask_transfer_away = neg_to_na(oa1),
         othrs_succ_transfer_away = neg_to_na(oa2),
         othrs_ask_transfer_here = neg_to_na(oa4),
         othrs_succ_transfer_here = neg_to_na(oa5),
         transfer_btwn_mkts = neg_to_na(oa7),
         ask_transfer_better_pay = neg_to_na(oa9_1),
         ask_transfer_job_easier = neg_to_na(oa9_2),
         ask_transfer_personal_reason = neg_to_na(oa9_3),
         transferred_promotion = neg_to_na(oa9_b_1),
         transferred_mkt_needed_staff = neg_to_na(oa9_b_2),
         transferred_unknown = neg_to_na(oa9_b_3),
         compl_surv_othr_mkt = neg_to_na(final1),
         final2_clean = neg_to_na(final2),
         date_transfer = oa8,
         date_othr_surv = final2a)

#removing NA values for factor purposes
tax_collector_base <- tax_collector_base %>% remove_NA_labels(na_vec, start = 362,
                                                            end = 376)

#compliance questions
tax_collector_base <- tax_collector_base %>% 
  mutate(meeting_held = ifelse(neg_to_na(te1) == 1, 1, 0),
         meeting_attended = ifelse(neg_to_na(te2) == 1, 1, 0),
         meeting_disc_dev = te3_1,
         meeting_disc_pay_fees = te3_2,
         meeting_disc_corrup = te3_3,
         meeting_disc_use_fees = te3_4,
         meeting_disc_sell_outside = te3_5,
         ms_disc_toilets = te3_ms_1,
         ms_disc_water = te3_ms_2,
         ms_disc_security = te3_ms_3,
         ms_disc_roofs_stalls = te3_ms_4,
         ms_disc_paths = te3_ms_5,
         ms_disc_trash = te3_ms_6,
         rev_info_system = rev_values(neg_to_na(te8)), #te label (- 1 when num)
         how_acc_info = rev_values(neg_to_na(te10)), #accurate label
         info_source_sms = te9_1,
         info_source_poster = te9_2,
         info_source_told = te9_3,
         info_source_mkt_com = te9_4,
         info_source_mkt_mang_fc = te9_5,
         any_constr = ifelse(neg_to_na(te5) == 1, 1, 0),
         constr_toilets = te6_1,
         constr_water = te6_2,
         constr_security = te6_3,
         constr_roofs_stalls = te6_4,
         constr_paths = te6_5,
         constr_trash = te6_6,
         who_resp_constr_mkt_mang = te7_1,
         who_resp_constr_wc = te7_2,
         who_resp_constr_dg = te7_3,
         who_resp_constr_mp = te7_4,
         who_resp_constr_USAID = te7_5,
         who_resp_constr_vendors = te7_6,
         how_paid = neg_to_na(e15), #paid label
         paid_incentives = ifelse(how_paid == 2, 1, 0),
         paid_all_commision = ifelse(how_paid == 4, 1, 0),
         monthly_earn = e17,
         what_happens_to_fees = neg_to_na(oa16), #fees_go label
         fees_dir_mobile_mon = ifelse(what_happens_to_fees == 3, 1, 0),
         how_sup_transf = neg_to_na(oa17), #transfer label
         mobile_money = ifelse(how_sup_transf == 1, 1, 0),
         how_oft_sup_transf = rev_values(neg_to_na(oa18)), #freq_transfer label
         vendor_counting = ifelse(te11 == 1, 1, 0),
         who_sent_counter = neg_to_na(te12), #counter label
         how_often_sup_visit = rev_values(neg_to_na(oa11)) #zonevisit label
  )

#removing NA values for factor purposes
tax_collector_base <- tax_collector_base %>% remove_NA_labels(na_vec, start = 379)


# relabel
te13_label <- c("No" = 1,
                "Yes - can find out how spent only" = 2,
                "Yes - can find out amount collected only" = 3,
                "Yes - find out both amount collect and how it was spent" = 4)  

accurate_label <- c("Very inaccurate" = 1, 
                    "Somewhat inaccurate" = 2,
                    "Somewhat accurate" = 3, 
                    "Very accurate" = 4)

fees_go_lab <- c("Bring them to the market manager" = 1,
                 "Bring them to the district revenue official" = 2,
                 "Give them to a mobile money vendor" = 3,
                 "Keep them until supervisor visits market to collect them (everyday)" = 4,
                 "Keep them until supervisor visits market to collect them (several days at once)" = 5,
                 "Deposit them at the bank" = 6)

paid_label <- c("A fixed salary, no matter how much you collect in fees" = 1,
                "A fixed salary, plus incentives if you reach certain targets" = 2,
                "A fixed salary, plus a certain percentage of what you collect" = 3,
                "No fixed salary, it just depends on how much you collect" = 4)

transfer_lab <- c("Via mobile money" = 1,
                  "Goes in person" = 2,
                  "Gives it to their supervisor who take the money in person" = 3,
                  "Deposit them at the bank" = 4)

freq_trans_lab <- c("Once a year" = 1,
                    "Once every few months" = 2,
                    "Once a month" = 3,
                    "A few times a month (2-3 times per month)" = 4,
                    "Once a week" = 5,
                    "Every 5-6 days" = 6,
                    "Every 3-4 days" = 7,
                    "Every other day" = 8,
                    "Every Day" = 9)

counter_talk_lab <- c("No, never" = 1,
                      "Yes, sometimes" = 2, 
                      "Yes, every time they come" = 3)

counter_freq_lab <- c("Once a year" = 1,
                      "A few times a year (2-4 times)" = 2,
                      "At least once a month" = 3,
                      "More than once a month" = 4)

counter_lab <- c("District government" = 1,
                 "Hired by market" = 2)

zonevisit_lab <- c("Never (in the past year)" = 1,
                   "Once in the past year" = 2,
                   "At least twice per year" = 3,
                   "At least 4 times in the past year" = 4,
                   "At least once per month" = 5,
                   "At least twice per month" = 6,
                   "Once a week or more" = 7,
                   "Daily" = 8)

val_labels(tax_collector_base) <- list(rev_info_system = te13_label,
                                      how_acc_info = accurate_label,
                                      how_paid = paid_label,
                                      what_happens_to_fees = fees_go_lab,
                                      how_sup_transf = transfer_lab,
                                      how_oft_sup_transf = freq_trans_lab,
                                      who_sent_counter = counter_lab,
                                      how_often_sup_visit = zonevisit_lab)


#convert labelled integers to factors
tax_collector_base <- mutate_if(tax_collector_base, is.labelled, to_factor)

#for analysis of baseline list experiment
tax_collector_base <- tax_collector_base %>% 
  mutate(list_treat = ifelse(!is.na(list_treatment) & 
                               is.na(list_control), 
                             1, 
                             ifelse(!is.na(list_control) &
                                      is.na(list_treatment),
                                    0, NA)),
         list_outcome = ifelse(!is.na(list_treatment) & 
                                 is.na(list_control) &
                                 list_treatment >= 0, 
                               list_treatment, 
                               ifelse(!is.na(list_control) &
                                        is.na(list_treatment) &
                                        list_control >= 0,
                                      list_control, NA)))

#read in treatment assignments
treatment_groups <- read_csv(paste0(clean_path, 'treatment_groups.csv'))

#merge in treatment assignment
tax_collector_base <- left_join(tax_collector_base, treatment_groups,
                                by = c("market" = "Market", "district" = "District"))

#create BU and TD treatment indicators
tax_collector_base <- mutate(tax_collector_base,
                             BU_treat = str_detect(treatment_status, "BU|BOTH") %>% 
                               as.numeric(),
                             TD_treat = str_detect(treatment_status, "TD|BOTH") %>% 
                               as.numeric(),
                             BU = ifelse(BU_treat == 1 & TD_treat == 0,
                                         1, 0),
                             TD = ifelse(BU_treat == 0 & TD_treat == 1,
                                         1, 0),
                             Both = ifelse(BU_treat == 1 & TD_treat == 1,
                                           1, 0))

#merge in treatment assignment block ids
tax_collector_base <- left_join(tax_collector_base, block_ids,
                                by = c("market" = "market", "district" = "district"))

#making enum and sup numeric to faciliate merge
tax_collector_base <- tax_collector_base %>% 
  mutate(#for merging with endline, sup has to be made numeric
    sup = as.numeric(sup) + 10,
    #for merging with endline, enumerator has to be made numeric and turned into unique ID,
    enum = as.numeric(enum) + 100)

#also turn some factors into numeric
tax_collector_base <- tax_collector_base %>% 
  mutate(how_acc_info_num = as.numeric(how_acc_info),
         how_paid_num = as.numeric(how_paid),
         how_oft_sup_transf_num = as.numeric(how_oft_sup_transf),
         how_often_sup_visit_num = as.numeric(how_often_sup_visit),
         rev_info_system_dich = ifelse(rev_info_system == "No", 0, 1))

#making education numeric
tax_collector_base$educ_num <- as.numeric(tax_collector_base$education) - 1
tax_collector_base$educ_none <- 1*(tax_collector_base$educ_num == 0)
tax_collector_base <- tax_collector_base %>% 
  mutate(educ_cat = ifelse(educ_num > 0 & educ_num <= 9, 1,
                           ifelse(educ_num > 9 & educ_num <= 13, 2,
                                  ifelse(educ_num > 13, 3, educ_num))))

# Saving -----------------------------------------------------------------------
#save as RData and STATA .dta files
save(tax_collector_base, file = "data/2_clean/tax_collector_base.RData")


